This Jupyter notebook is intended to be used alongside the book Python for Bioinformatics.
Note: Before a sample file can be opened, it must be accessible from this Jupyter notebook. To make that possible, the following commands download the sample files from GitHub and extract them into a directory called samples.
In [1]:
!curl https://raw.githubusercontent.com/Serulab/Py4Bio/master/samples/samples.tar.bz2 -o samples.tar.bz2
!mkdir samples
!tar xvfj samples.tar.bz2 -C samples
SAX: cElementTree Iterparse
In [2]:
# xml.etree.cElementTree was deprecated in Python 3.3 and removed in 3.9;
# xml.etree.ElementTree picks up the C accelerator on its own. The cET alias
# is kept because the following cells call cET.iterparse.
import xml.etree.ElementTree as cET

# Stream the record and print the text, checksum and length of every element
# whose tag contains 'sequence'.
for event, elem in cET.iterparse('samples/uniprotrecord.xml',
                                 events=('start', 'end')):
    if event != 'end':
        # At 'start' the element is not complete yet, and clearing it here
        # would throw away the attributes that are printed at 'end'.
        continue
    if 'sequence' in elem.tag:
        print('Sequence: {0}'.format(elem.text))
        print('Checksum: {0}'.format(elem.attrib["checksum"]))
        print('Length: {0}'.format(elem.attrib["length"]))
    # The element is finished; release its contents to keep memory flat.
    elem.clear()
In [3]:
# No events argument is given, so iterparse reports only 'end' events and
# every element is complete when it is yielded.
for event, elem in cET.iterparse('samples/uniprotrecord.xml'):
    if 'sequence' in elem.tag:
        attrs = elem.attrib
        print('Sequence: {0}'.format(elem.text))
        print('Checksum: {0}'.format(attrs["checksum"]))
        print('Length: {0}'.format(attrs["length"]))
    # Release the finished element before moving on to the next one.
    elem.clear()
In [4]:
# Open the event stream (both 'start' and 'end' events) and consume its first
# event; the element delivered with it is kept as root for the next cell.
allelements = iter(
    cET.iterparse('samples/uniprotrecord.xml', events=('start', 'end'))
)
event, root = next(allelements)
In [5]:
# Walk the remaining events and print the text of each completed element
# whose tag contains 'sequence'.
for event, elem in allelements:
    if event != 'end' or 'sequence' not in elem.tag:
        continue
    print(elem.text)
    # Drop everything collected under root so far.
    root.clear()
In [6]:
from bs4 import BeautifulSoup as bs

# Parse the local sample record. The with-block closes the file handle once
# BeautifulSoup has read it, instead of leaving it open for the whole session.
with open('samples/uniprotrecord.xml') as xml_file:
    soup = bs(xml_file, 'lxml')
In [7]:
import requests

url = 'https://s3.amazonaws.com/py4bio/uniprotrecord.xml'
# requests has no default timeout; without one an unresponsive server would
# block this cell indefinitely.
req = requests.get(url, timeout=30)
# Stop here on an HTTP error rather than handing an error page to the parser.
req.raise_for_status()
# Raw response body (bytes) used by the next cell.
c = req.content
In [8]:
from bs4 import BeautifulSoup as bs

# Rebind soup to a tree parsed from the downloaded bytes held in c.
soup = bs(c, features='lxml')
In [9]:
# Display the first <sequence> tag of the parsed document.
soup.sequence
Out[9]:
In [10]:
# Display the text held by the first <sequence> tag.
soup.sequence.string
Out[10]:
In [11]:
# Display the checksum attribute of the first <sequence> tag.
soup.sequence.get('checksum')
Out[11]:
In [12]:
# Display the length attribute of the first <sequence> tag.
soup.sequence.get('length')
Out[12]:
In [13]:
# Print the string of each child of <lineage>, skipping children whose
# string is just a newline.
for taxon in soup.lineage.children:
    name = taxon.string
    if name == '\n':
        continue
    print(name)
In [14]:
# Print the text of the first <sequence> tag with a label.
print('Sequence: {0}'.format(soup.sequence.string))
In [15]:
# Print the checksum attribute of the first <sequence> tag with a label.
print('Checksum: {0}'.format(soup.sequence.get('checksum')))
In [16]:
# Print the length attribute of the first <sequence> tag with a label.
print('Length: {0}'.format(soup.sequence.get('length')))